# compare topic model fits
# with/without structural features
# with/without cut-off documents

# note: the data necessary to run this code contains sensitive information,
# and is therefore omitted
# Attach `here` so project files can be addressed relative to the project
# root instead of the current working directory.
library(here)

# Show the project root that `here` resolved. This is a bare top-level
# expression, so its value is printed when the script is run interactively
# or through Rscript.
here::here()

# Source the shared startup script via the project root, so this script no
# longer depends on the caller's working directory.
# NOTE(review): startup.R is not visible here; confirm it has no
# working-directory assumptions of its own.
source(here::here("code", "startup.R"))
library(stm)

# Load the analysis data (the real path is redacted because the data are
# sensitive). Presumably this provides `rfs_sub`, which is used below but
# never created in this script -- confirm.
load("output omitted")

# Seed R's random number generator once, before any model search is run.
set.seed(12345)

# Mean-impute `c.male`: keep observed values and replace missing ones with
# the mean of the observed values.
rfs_sub$c.male.imp <- with(
  rfs_sub,
  ifelse(!is.na(c.male), c.male, mean(c.male, na.rm = TRUE))
)

# Flag responses whose raw text is exactly 255 characters long
# (1 = exactly 255 characters, 0 = any other length). The flag is NA wherever
# nchar() returns NA, e.g. for a missing response.
# Presumably 255 is the truncation limit of the intake form -- confirm.
# NOTE(review): the flag is computed from `Candidate.Intake.Form..why`, while
# the topic models below are fit on `whyrun`; confirm both hold the same text.
rfs_sub$cutoff_flag <- as.numeric(
  nchar(rfs_sub$Candidate.Intake.Form..why) == 255
)

# Run STM's text preprocessing with its default settings, passing the whole
# data frame along as document-level metadata.
processed <- stm::textProcessor(
  documents = rfs_sub$whyrun,
  metadata = rfs_sub
)

# Turn the processed text into the documents/vocab/meta triple used for
# fitting (again with STM's default thresholds).
out <- stm::prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# Short aliases for the prepared corpus components.
meta <- out$meta
vocab <- out$vocab
docs <- out$documents

# Search over the number of topics (K) on the full corpus.
# "structured" runs include prevalence covariates; "unstructured" runs do not.
# The four searches are kept in this order because they run one after another
# under the single seed set at the top of the script.

# Structured, coarse grid: K = 10, 20, 30, 40.
sk_init_structured <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(10, 20, 30, 40),
  prevalence = ~ c.male.imp + c.white + USR +
    std.age + running + log_why_wordcount * cutoff_flag,
  data = meta
)

# Structured, fine grid: every K from 10 to 20.
sk_narrow_structured <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(10:20),
  prevalence = ~ c.male.imp + c.white + USR +
    std.age + running + log_why_wordcount * cutoff_flag,
  data = meta
)

# Unstructured, coarse grid (no prevalence formula is supplied).
sk_init_unstructured <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(10, 20, 30, 40),
  data = meta
)

# Unstructured, fine grid.
sk_narrow_unstructured <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(10:20),
  data = meta
)

# Persist all four full-corpus search results together.
save(
  sk_init_structured, sk_init_unstructured,
  sk_narrow_structured, sk_narrow_unstructured,
  file = "output omitted"
)

# Keep only responses that were not flagged as cut off.
# which() is used instead of a bare logical index because `cutoff_flag` is NA
# wherever nchar() returned NA; indexing with an NA logical would insert
# all-NA documents and all-NA metadata rows into the corpus. Rows with an
# unknown flag are therefore left out of the no-cutoff subset.
rfs_nocut <- rfs_sub[which(rfs_sub$cutoff_flag == 0), ]

# Preprocess the no-cutoff subset with STM's defaults, using the same subset
# for the documents and for their metadata so the two stay aligned.
processed.nocut <- stm::textProcessor(rfs_nocut$whyrun, metadata = rfs_nocut)
out.nocut <- stm::prepDocuments(processed.nocut$documents,
                                processed.nocut$vocab,
                                processed.nocut$meta)

# Short aliases for the prepared no-cutoff corpus components.
docs.nocut <- out.nocut$documents
vocab.nocut <- out.nocut$vocab
meta.nocut <- out.nocut$meta

# Repeat the search over K on the corpus without cut-off documents.
# The prevalence formula omits `cutoff_flag` here (it is 0 for every
# document in this subset). Call order is kept as-is because the searches
# run sequentially under the seed set at the top of the script.

# Structured, coarse grid: K = 10, 20, 30, 40.
sk_init_structured.nc <- searchK(
  documents = out.nocut$documents,
  vocab = out.nocut$vocab,
  K = c(10, 20, 30, 40),
  prevalence = ~ c.male.imp + c.white + USR +
    std.age + running + log_why_wordcount,
  data = meta.nocut
)

# Structured, fine grid: every K from 10 to 20.
sk_narrow_structured.nc <- searchK(
  documents = out.nocut$documents,
  vocab = out.nocut$vocab,
  K = c(10:20),
  prevalence = ~ c.male.imp + c.white + USR +
    std.age + running + log_why_wordcount,
  data = meta.nocut
)

# Unstructured, coarse grid (no prevalence formula is supplied).
sk_init_unstructured.nc <- searchK(
  documents = out.nocut$documents,
  vocab = out.nocut$vocab,
  K = c(10, 20, 30, 40),
  data = meta.nocut
)

# Unstructured, fine grid.
sk_narrow_unstructured.nc <- searchK(
  documents = out.nocut$documents,
  vocab = out.nocut$vocab,
  K = c(10:20),
  data = meta.nocut
)

# Persist all four no-cutoff search results together.
save(
  sk_init_structured.nc, sk_init_unstructured.nc,
  sk_narrow_structured.nc, sk_narrow_unstructured.nc,
  file = "output omitted"
)


# Emit one status message giving the largest semantic-coherence value found
# in a searchK result.
#   label: text placed in front of the value.
#   sk:    a searchK result; its `results$semcoh` entries are read.
# which.max() picks the first maximum, so ties resolve to the earliest entry
# of the K grid.
report_max_semcoh <- function(label, sk) {
  semcoh <- sk$results$semcoh
  message(paste0(label, semcoh[which.max(semcoh)]))
}

# Full corpus, fine K grid.
report_max_semcoh("Max semantic coherence in structured model: ",
                  sk_narrow_structured)
report_max_semcoh("Max semantic coherence in unstructured model: ",
                  sk_narrow_unstructured)

# Corpus without cut-off documents, fine K grid.
report_max_semcoh(
  "Max semantic coherence in structured model (no cut off docs): ",
  sk_narrow_structured.nc
)
report_max_semcoh(
  "Max semantic coherence in unstructured model (no cut off docs): ",
  sk_narrow_unstructured.nc
)
